@@ -264,8 +264,9 @@ module Agents |
||
| 264 | 264 |
error "Ignoring a non-HTTP url: #{url.inspect}"
|
| 265 | 265 |
return |
| 266 | 266 |
end |
| 267 |
- log "Fetching #{url}"
|
|
| 268 |
- response = faraday.get(url) |
|
| 267 |
+ uri = Utils.normalize_uri(url) |
|
| 268 |
+ log "Fetching #{uri}"
|
|
| 269 |
+ response = faraday.get(uri) |
|
| 269 | 270 |
raise "Failed: #{response.inspect}" unless response.success?
|
| 270 | 271 |
|
| 271 | 272 |
interpolation_context.stack {
|
@@ -303,7 +304,7 @@ module Agents |
||
| 303 | 304 |
interpolated['extract'].keys.each do |name| |
| 304 | 305 |
result[name] = output[name][index] |
| 305 | 306 |
if name.to_s == 'url' |
| 306 |
- result[name] = (response.env[:url] + result[name]).to_s |
|
| 307 |
+ result[name] = (response.env[:url] + Utils.normalize_uri(result[name])).to_s |
|
| 307 | 308 |
end |
| 308 | 309 |
end |
| 309 | 310 |
|
@@ -21,6 +21,18 @@ module Utils |
||
| 21 | 21 |
end |
| 22 | 22 |
end |
| 23 | 23 |
|
| 24 |
# Coerces +uri+ into a URI object, percent-encoding characters that the
# strict parser rejects (spaces, non-ASCII text, and so on).
#
# The input is first handed to Kernel#URI unchanged, so already-valid URIs
# (and URI objects) come back untouched. Only when that raises URI::Error is
# the string repaired:
#
# * every byte of a run of unsafe characters is emitted as %XX;
# * existing %XX escapes are preserved rather than re-encoded to %25XX;
# * a stray "%" that does not start a valid escape is encoded as %25;
# * the first "#" is kept as the fragment delimiter; any later "#" is
#   encoded so the repaired string still parses.
#
# @param uri [String, URI] the value to normalize
# @return [URI] the parsed (and, if necessary, escaped) URI
# @raise [URI::Error] if the value still cannot be parsed after escaping
def self.normalize_uri(uri)
  URI(uri)
rescue URI::Error
  # "%" is absent from the character class so valid escapes survive; the
  # first alternative catches a "%" that is not followed by two hex digits.
  unsafe = %r{%(?!\h\h)|[^\-_.!~*'()a-zA-Z\d;/?:@&=+$,\[\]%]+}
  escape = lambda do |part|
    part.gsub(unsafe) { |run| run.bytes.map { |byte| format('%%%02X', byte) }.join }
  end

  # Split once so the fragment delimiter itself is never escaped.
  head, delimiter, fragment = uri.to_s.partition('#')
  URI((escape.call(head) + delimiter + escape.call(fragment)).force_encoding(Encoding::US_ASCII))
end
|
| 35 |
+ |
|
| 24 | 36 |
def self.interpolate_jsonpaths(value, data, options = {})
|
| 25 | 37 |
if options[:leading_dollarsign_is_jsonpath] && value[0] == '$' |
| 26 | 38 |
Utils.values_at(data, value).first.to_s |
@@ -0,0 +1,17 @@ |
||
| 1 |
+<html> |
|
| 2 |
+ <head> |
|
| 3 |
+ <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"> |
|
| 4 |
+ <title>test</title> |
|
| 5 |
+ </head> |
|
| 6 |
+ <body> |
|
| 7 |
+ <ul> |
|
| 8 |
+ <li><a href="http://google.com">google</a></li> |
|
| 9 |
+ <li><a href="https://www.google.ca/search?q=some query">broken</a></li> |
|
| 10 |
+ <li><a href="https://www.google.ca/search?q=some%20query">escaped</a></li> |
|
| 11 |
+ <li><a href="http://ko.wikipedia.org/wiki/위키백과:대문">unicode url</a></li> |
|
| 12 |
+ <li><a href="https://www.google.ca/search?q=위키백과:대문">unicode param</a></li> |
|
| 13 |
+ <li><a href="http://ko.wikipedia.org/wiki/%EC%9C%84%ED%82%A4%EB%B0%B1%EA%B3%BC:%EB%8C%80%EB%AC%B8">percent encoded url</a></li> |
|
| 14 |
+ <li><a href="https://www.google.ca/search?q=%EC%9C%84%ED%82%A4%EB%B0%B1%EA%B3%BC:%EB%8C%80%EB%AC%B8">percent encoded param</a></li> |
|
| 15 |
+ </ul> |
|
| 16 |
+ </body> |
|
| 17 |
+</html> |
@@ -911,4 +911,67 @@ fire: hot |
||
| 911 | 911 |
end |
| 912 | 912 |
end |
| 913 | 913 |
end |
| 914 |
+ |
|
| 915 |
# Verifies that every link extracted from the urlTest.html fixture is emitted
# as a well-formed, percent-encoded absolute URL, one event per link.
describe "checking urls" do
  before do
    fixture_html = File.read(Rails.root.join("spec/data_fixtures/urlTest.html"))
    stub_request(:any, /example/).to_return(body: fixture_html, status: 200)

    @valid_options = {
      'name' => "Url Test",
      'expected_update_period_in_days' => "2",
      'type' => "html",
      'url' => "http://www.example.com",
      'mode' => 'all',
      'extract' => {
        'url' => { 'css' => "a", 'value' => "@href" }
      }
    }

    @checker = Agents::WebsiteAgent.new(name: "ua", options: @valid_options)
    @checker.user = users(:bob)
    @checker.save!
  end

  describe "#check" do
    before do
      # The fixture contains seven anchors, so one check creates seven events.
      expect { @checker.check }.to change { Event.count }.by(7)
      @events = Event.last(7)
    end

    it "should check hostname" do
      expect(@events[0].payload['url']).to eq("http://google.com")
    end

    it "should check unescaped query" do
      expect(@events[1].payload['url']).to eq("https://www.google.ca/search?q=some%20query")
    end

    it "should check properly escaped query" do
      expect(@events[2].payload['url']).to eq("https://www.google.ca/search?q=some%20query")
    end

    it "should check unescaped unicode url" do
      expect(@events[3].payload['url']).to eq("http://ko.wikipedia.org/wiki/%EC%9C%84%ED%82%A4%EB%B0%B1%EA%B3%BC:%EB%8C%80%EB%AC%B8")
    end

    it "should check unescaped unicode query" do
      expect(@events[4].payload['url']).to eq("https://www.google.ca/search?q=%EC%9C%84%ED%82%A4%EB%B0%B1%EA%B3%BC:%EB%8C%80%EB%AC%B8")
    end

    it "should check properly escaped unicode url" do
      expect(@events[5].payload['url']).to eq("http://ko.wikipedia.org/wiki/%EC%9C%84%ED%82%A4%EB%B0%B1%EA%B3%BC:%EB%8C%80%EB%AC%B8")
    end

    it "should check properly escaped unicode query" do
      expect(@events[6].payload['url']).to eq("https://www.google.ca/search?q=%EC%9C%84%ED%82%A4%EB%B0%B1%EA%B3%BC:%EB%8C%80%EB%AC%B8")
    end
  end
end
|
| 914 | 977 |
end |